# First replication file for Protest and Digital Adaptation (V3 MMAD)
# This file replicates the data generating process. 



####################################
#0. Load Required Packages
###################################
rm(list = ls())
library(RCurl)       #allows for direct download from Tor Metrics
library(tidyverse)
library(utils)       #to generate empty dataframe
library(countrycode) #to convert countrycodes
library(usethis)
library(devtools)
library(vdem)        #to directly load data from the VDem Project
library(lubridate)
library(readxl)
library(chron)

####################################
#1. Download Datasets
###################################

# Download Data on Tor Users directly via Tor Metrics as csv file
# (Data on Relay and Bridge usage is updated daily and varies by date of download)

download_relay <- getURL("https://metrics.torproject.org/userstats-relay-country.csv")
relay <- read.csv(text = download_relay, comment.char="#")
write.csv(relay, "relay.csv", row.names = FALSE)

download_bridge <- getURL("https://metrics.torproject.org/userstats-bridge-country.csv")
bridge <- read.csv (text = download_bridge, comment.char="#")
write.csv(bridge, "bridge.csv", row.names = FALSE)

# drop variables (fractions, lower and upper bounds of relay and bridge users)
relay <- dplyr::select(relay, date, country, users)
bridge <- dplyr::select(bridge, date, country, users)

# rename relay and bridge users in each file 
names(relay)[3] <- "users_relay"
names(bridge)[3] <- "users_bridge"


# Load MMAD datafile (Version 3)
events <- read.csv(file.path("events-v3.csv"), stringsAsFactors = F)
head(events)



#############################################
#2. Generate Empty Dataframe 
#############################################

# Generate empty dataframe on daily basis
df <- data.frame()
df <- expand.grid(seq(as.Date("2011-09-01"), as.Date("2018-12-31"), by = "day"),
                  levels(as.factor(events$cowcode)))

names(df)[1] <- "date"
names(df)[2] <- "country"
str(df)
df$country <- as.character(df$country)
df$year <- as.numeric(format(df$date,'%Y'))



################################################
#3. Aggregate Tor Data
################################################
# Tor Metrics provides daily data on the number of relay and bridge users. 
# As there are days within countries which show no
# record of Tor users, this observation is not available. 
# Therefore, we need to create an empty dataframe, which we fill
# with data on Tor users eventually. 

################################################
#3.a Modify relay data and join to empty framework
################################################
str(relay)
relay$date <- as.Date(relay$date)


#change ISO-2 Code to CoW Code
relay[[2]] <- toupper(relay[[2]])

#include Serbia
relay_serbia <- relay %>% filter(country=="RS")
relay_serbia$country <- 345

#change to CoW Code
relay$country <- countrycode(relay$country, origin = 'iso2c', destination = 'cown')
relay$country <- as.character(relay$country)

relay <- rbind(relay, relay_serbia)


#merge with empty data frame by day and country
df <- left_join(df, relay, by = c("date", "country"))


################################################
#3.b Modify bridge data and join to empty framework
################################################
str(bridge)
bridge$date <- as.Date(bridge$date)

#change ISO-2 Code to CoW Code
bridge[[2]] <- toupper(bridge[[2]])

#include Serbia
bridge_serbia <- bridge %>% filter(country=="RS")
bridge_serbia$country <- 345

#change to CoW Code
bridge$country <- countrycode(bridge$country, origin = 'iso2c', destination = 'cown')
bridge$country <- as.character(bridge$country)

bridge <- rbind(bridge, bridge_serbia)

#merge with empty data frame by day and country
df <- left_join(df, bridge, by = c("date", "country"))


################################################
#3.c Additional dependent variables 
################################################
df$users_relay[is.na(df$users_relay)] <- 0
df$users_bridge[is.na(df$users_bridge)] <- 0

#generate total number of tor users by adding up relay and bridge users
df <- df %>%
  mutate(users_total = rowSums(.[4:5],na.rm=T))


################################################
#4. Aggregate MMAD
################################################
# We only keep anti-government protest.
events <- filter(events, side == 1)
################################################
#4.a Aggregate protest data to daily observations
################################################
events$protest <- 1
str(events)
names(events)[1] <- "country"
names(events)[6] <- "date"
events$date <- as.Date(events$date)
events$country <- as.character(events$country)
# merge with empty data frame by day and country
# generate new variables: number of protest, maximum number of protest participants
# and whether violent state repression (governmental intervention) has occurred. 
events$day <- format(events$date, format="%Y-%m-%d")
p <- events %>%
  group_by(country, day) %>% 
  summarize(protest_number=sum(protest),
            max_numparticipants= max(mean_avg_numparticipants),
            max_secengagement = max(max_secengagement))
names(p)[2] <-"date"
p$date <- as.Date(p$date)
df <- left_join(df, p, by = c("date", "country"))
#we have about 4000 observations left in the given time frame 
p %>%
  filter(date > as.Date('2011-08-31'))
#generate dummy for protest occurrence
df$protest <- ifelse(df$protest_number >= 1,1,0)
df$protest[is.na(df$protest_number)] <- 0

#######################################
#4. We only keep pro-government protest.
#######################################
events <- read.csv(file.path("events-v3.csv"), stringsAsFactors = F)
head(events)
events_pro <- filter(events, side == 0)

################################################
#4.b Aggregate protest data to daily observations
################################################
events_pro$protest <- 1

str(events_pro)
names(events_pro)[1] <- "country"
names(events_pro)[6] <- "date"
events_pro$date <- as.Date(events_pro$date)
events_pro$country <- as.character(events_pro$country)

# merge with empty data frame by day and country
# generate new variables: number of protest, maximum number of protest participants, sum of protest participants,
# and whether violent state repression has occurred. 
events_pro$day <- format(events_pro$date, format="%Y-%m-%d")

p2 <- events_pro %>%
  group_by(country, day) %>% 
  summarize(pro_protest_number=sum(protest),
            pro_max_numparticipants= max(mean_avg_numparticipants),
            pro_max_secengagement = max(max_secengagement))

names(p2)[2] <-"date"
p2$date <- as.Date(p2$date)

df <- left_join(df, p2, by = c("date", "country"))


#generate dummy for protest occurrence
df$pro_protest <- ifelse(df$pro_protest_number >= 1,1,0)
df$pro_protest[is.na(df$pro_protest)] <- 0



################################################
#5. VDEM Data on Digital Media Freedom
################################################
# In addition, we use data on Digital Media Freedom provided by the Varieties of Democracy Project.
# In particular, we look at Government Internet filtering in practice and
# Government social media shut down in practice.
# Extract information directly via GitHub: https://github.com/xmarquez/vdem
# remotes::install_github("xmarquez/vdem")
vdem <- extract_vdem(section_number = 6)

vdem <- dplyr::select(vdem, year, GWn, v2smgovfilprc, v2smgovsmcenprc)
vdem$GWn <- countrycode(vdem$GWn, origin = 'gwn', destination = 'cown')

names(vdem) <- c("year", "country", "filter", "social_media")


################################################
#5.a Merge to data frame
################################################
vdem$country <- as.character(vdem$country)
df <- left_join(df, vdem, by = c("year", "country"))




################################################
#6. Include EMDAT Dataset 
################################################
emdat <- read_excel(file.path("emdat.xlsx"))
head(emdat)

#select variables
emdat <- dplyr::select(emdat, ISO, `Start Year`, `Start Month`, `Start Day`, 
                       `End Year`, `End Month`, `End Day`)
summary(emdat)

#collapse date variable (failed to parse for NAs)                      
emdat$startdate <- with(emdat, ymd(paste(`Start Year`, `Start Month`, `Start Day`, sep= ' ')))
emdat$enddate <- with(emdat, ymd(paste(`End Year`, `End Month`, `End Day`, sep= ' ')))
emdat <- dplyr::select(emdat, ISO, startdate, enddate)


#preparation to merge
names(emdat) <- c("country", "startdate", "enddate")
emdat$startdate <- as.Date(emdat$startdate, format="%Y-%m-%d")
emdat$enddate <- as.Date(emdat$enddate, format="%Y-%m-%d")
emdat$country <- countrycode(emdat$country, origin = 'iso3c', destination = 'cown')
emdat$country <- as.character(emdat$country)

#create list of days instead of time periods
#for this, we need a unique identifier
emdat <- emdat %>% mutate(id = row_number())
emdat <- emdat %>%
  filter(!startdate>enddate) #erase false observations where start date is greater than end date

#create sequence of disaster events
emdat <- emdat %>% 
  transmute(id=id, country = country, date = map2(startdate, enddate, ~seq(.x, .y, by = 'day'))) %>% 
  unnest

emdat <- dplyr::select(emdat, country, date)
emdat$disaster <- 1


#aggregate to one observation per day
emdat <- emdat %>%
  group_by(country, date) %>% 
  summarize(disaster_number=sum(disaster))

#join dataframes
df <- left_join(df, emdat, by = c("date", "country"))

#generate 0 instead of NAs
df$disaster <- ifelse(df$disaster_number >= 1,1,0)
df$disaster[is.na(df$disaster)] <- 0
df$disaster_number[is.na(df$disaster_number)] <- 0



###################################
#7. SCAD Data
####################################
scad_africa <- read.csv(file.path("SCAD2018Africa_Final.csv"), stringsAsFactors = F)
scad_latin <- read.csv(file.path("SCAD2018LatinAmerica_Final.csv"), stringsAsFactors = F)

###select respective variables 
scad_africa <- dplyr::select(scad_africa, eventid, id, ccode, startdate, enddate, etype, cgovtarget)
scad_latin <- dplyr::select(scad_latin, eventid, id, ccode, startdate, enddate, etype, cgovtarget)

###append new datasets
scad <- rbind(scad_africa,scad_latin)

###check and erase possible duplicates
scad$eventid[duplicated(scad$eventid)]
scad <- scad %>%
  distinct(eventid, .keep_all = TRUE)

###only include anti-government protests 
scad <- scad %>%
  filter(cgovtarget==1)

### select only organized demonstrations, spontaneous demonstration, organized violent riot,
### spontaneous violent riot, general strike, limited strike and anti government violence
scad <- scad %>%
  filter(etype==1 | etype==2 | etype==3 | etype==4 | etype==5 | etype==6 |etype==8)



###no need to arrange country codes: already CoW

###arrange date variables
scad$startdate <- as.Date(chron(format(as.Date(scad$startdate, "%d-%b-%Y"), "%m/%d/%y")))
scad$enddate <- as.Date(chron(format(as.Date(scad$enddate, "%d-%b-%Y"), "%m/%d/%y")))

#create sequence of events
scad <- scad %>%
  filter(!startdate>enddate)

###create sequence of events instead of from-to version
scad <- scad %>% 
  transmute(eventid=eventid, ccode = ccode, date = map2(startdate, enddate, ~seq(.x, .y, by = 'day'))) %>% 
  unnest


###preparation to merge
scad$protest <- 1  #add protest variable 
names(scad) <- c("id", "country", "date", "protest")

###aggregate to number of events per day
scad <- scad %>%
  group_by(country, date) %>% 
  summarize(scad_protestnumber=sum(protest))

###merge to dataset
scad$country <- as.character(scad$country)
df <- left_join(df, scad, by = c("date", "country"))

###add protest occurrence (depending on number of protests per day) - generate 0 instead of NAs
df$scad_protest <- ifelse(df$scad_protestnumber >= 1,1,0)
df$scad_protest [is.na(df$scad_protest )] <- 0




###################################
#8. Subset Country-periods in MMAD
####################################

# MMAD considers autocracies following the definition by Geddes, Wright and Frantz (2014)
# and by V-Dem (Version v9).
# Not all countries cover the time period defined as autocracies before 2011-08-28 or until 2018-12-31.
# For further specifications, please see MMAD Codebook: List of Country-periods in the MMAD.

df <- df %>%
  filter(!(country == 700 & date > as.Date('2014-06-14') & date < as.Date('2016-01-01')))%>%
  filter(!(country == 692 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 771 & date < as.Date('2013-01-01')))%>%
  filter(!(country == 571 & date > as.Date('2015-12-31')))%>%
  filter(!(country == 439 & date > as.Date('2015-12-31')))%>%
  filter(!(country == 516 & date < as.Date('2013-01-01')))%>%
  filter(!(country == 651 & date > as.Date('2012-06-30') & date < as.Date('2013-01-01')))%>%
  filter(!(country == 411 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 420 & date > as.Date('2017-12-31')))%>%
  filter(!(country == 372))%>%
  filter(!(country == 438 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 404 & date < as.Date('2013-01-01')))%>%
  filter(!(country == 404 & date > as.Date('2014-12-31') & date < as.Date('2018-01-01') ))%>%
  filter(!(country == 41 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 91 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 645 & date < as.Date('2017-01-01')))%>%
  filter(!(country == 437 & date > as.Date('2015-12-31')))%>%
  filter(!(country == 501 & date < as.Date('2017-01-01')))%>%
  filter(!(country == 703 & date > as.Date('2012-12-31') & date < as.Date('2016-01-01')))%>%
  filter(!(country == 703 & date > as.Date('2017-12-31')))%>%
  filter(!(country == 606 & date < as.Date('2018-01-01')))%>%
  filter(!(country == 606 & date < as.Date('2017-01-01') & date > as.Date('2017-12-31')))%>%
  filter(!(country == 450))%>%
  filter(!(country == 620 & date > as.Date('2012-12-31')))%>%
  filter(!(country == 343 & date < as.Date('2017-01-01') & date > as.Date('2017-12-31')))%>%
  filter(!(country == 580 & date > as.Date('2013-12-31') & date < as.Date('2016-01-01')))%>%
  filter(!(country == 565 & date > as.Date('2015-12-31')))%>%
  filter(!(country == 790))%>%
  filter(!(country == 93 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 436))%>%
  filter(!(country == 770 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 910 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 694 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 520 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 780 & date < as.Date('2013-01-01') & date > as.Date('2015-12-31')))%>%
  filter(!(country == 652 & date > as.Date('2012-12-31') & date < as.Date('2016-01-01')))%>%
  filter(!(country == 510 & date > as.Date('2017-12-31')))%>%
  filter(!(country == 800 & date < as.Date('2014-05-22')))%>%
  filter(!(country == 461 & date > as.Date('2015-12-31') & date < as.Date('2017-01-01')))%>%
  filter(!(country == 616 & date > as.Date('2012-12-31')))%>%
  filter(!(country == 640 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 369 & date < as.Date('2016-01-01')))%>%
  filter(!(country == 679 & date > as.Date('2012-12-31')))

##################################
#9. Export csv.file 
##################################
write.csv(df, gzfile("df.csv.gz"), row.names = FALSE)


